library(ggplot2)
library(scales)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.9-0. For overview type 'help("mgcv-package")'.
library(maps)
library(mapdata)
transfer_data <- read.csv("merged_cleaned_data.csv")
head(transfer_data)
##         club                    name age   nationality           position
## 1 Arsenal FC Alex Oxlade-Chamberlain  17       England   Central Midfield
## 2 Arsenal FC                Gervinho  24 Cote d'Ivoire        Left Winger
## 3 Arsenal FC            Mikel Arteta  29         Spain   Central Midfield
## 4 Arsenal FC         Per Mertesacker  26       Germany        Centre-Back
## 5 Arsenal FC            André Santos  28        Brazil Attacking Midfield
## 6 Arsenal FC          Chu-young Park  26  Korea, South     Centre-Forward
##   short_pos market_value  dealing_club dealing_country      fee movement window
## 1        CM      2500000   Southampton         England 13800000       in summer
## 2        LW     15000000    LOSC Lille          France 12000000       in summer
## 3        CM     15000000       Everton         England 12000000       in summer
## 4        CB     12500000 Werder Bremen         Germany 11300000       in summer
## 5        AM      8500000    Fenerbahce          Turkey  7000000       in summer
## 6        CF      7000000        Monaco          France  6500000       in summer
##           league season is_loan    loan_status
## 1 Premier League   2011   False Not Applicable
## 2 Premier League   2011   False Not Applicable
## 3 Premier League   2011   False Not Applicable
## 4 Premier League   2011   False Not Applicable
## 5 Premier League   2011   False Not Applicable
## 6 Premier League   2011   False Not Applicable
str(transfer_data)
## 'data.frame':    17880 obs. of  16 variables:
##  $ club           : chr  "Arsenal FC" "Arsenal FC" "Arsenal FC" "Arsenal FC" ...
##  $ name           : chr  "Alex Oxlade-Chamberlain" "Gervinho" "Mikel Arteta" "Per Mertesacker" ...
##  $ age            : num  17 24 29 26 28 26 19 19 24 24 ...
##  $ nationality    : chr  "England" "Cote d'Ivoire" "Spain" "Germany" ...
##  $ position       : chr  "Central Midfield" "Left Winger" "Central Midfield" "Centre-Back" ...
##  $ short_pos      : chr  "CM" "LW" "CM" "CB" ...
##  $ market_value   : num  2500000 15000000 15000000 12500000 8500000 ...
##  $ dealing_club   : chr  "Southampton" "LOSC Lille" "Everton" "Werder Bremen" ...
##  $ dealing_country: chr  "England" "France" "England" "Germany" ...
##  $ fee            : int  13800000 12000000 12000000 11300000 7000000 6500000 1130000 1000000 34000000 27500000 ...
##  $ movement       : chr  "in" "in" "in" "in" ...
##  $ window         : chr  "summer" "summer" "summer" "summer" ...
##  $ league         : chr  "Premier League" "Premier League" "Premier League" "Premier League" ...
##  $ season         : int  2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
##  $ is_loan        : chr  "False" "False" "False" "False" ...
##  $ loan_status    : chr  "Not Applicable" "Not Applicable" "Not Applicable" "Not Applicable" ...
unique_positions <- unique(transfer_data$short_pos)
print(unique_positions)
##  [1] "CM"       "LW"       "CB"       "AM"       "CF"       "RB"      
##  [7] "LB"       "RW"       "GK"       "LM"       "DM"       "RM"      
## [13] "SS"       "defence"  "midfield" "attack"
transfer_data <- transfer_data %>%
  mutate(position_category = case_when(
    short_pos %in% c("CM", "LM", "AM", "DM", "RM") ~ "midfield",
    short_pos %in% c("RB", "LB", "CB") ~ "defence",
    short_pos %in% c("SS", "LW", "RW", "CF") ~ "attack",
    TRUE ~ short_pos  
  ))

head(transfer_data)
##         club                    name age   nationality           position
## 1 Arsenal FC Alex Oxlade-Chamberlain  17       England   Central Midfield
## 2 Arsenal FC                Gervinho  24 Cote d'Ivoire        Left Winger
## 3 Arsenal FC            Mikel Arteta  29         Spain   Central Midfield
## 4 Arsenal FC         Per Mertesacker  26       Germany        Centre-Back
## 5 Arsenal FC            André Santos  28        Brazil Attacking Midfield
## 6 Arsenal FC          Chu-young Park  26  Korea, South     Centre-Forward
##   short_pos market_value  dealing_club dealing_country      fee movement window
## 1        CM      2500000   Southampton         England 13800000       in summer
## 2        LW     15000000    LOSC Lille          France 12000000       in summer
## 3        CM     15000000       Everton         England 12000000       in summer
## 4        CB     12500000 Werder Bremen         Germany 11300000       in summer
## 5        AM      8500000    Fenerbahce          Turkey  7000000       in summer
## 6        CF      7000000        Monaco          France  6500000       in summer
##           league season is_loan    loan_status position_category
## 1 Premier League   2011   False Not Applicable          midfield
## 2 Premier League   2011   False Not Applicable            attack
## 3 Premier League   2011   False Not Applicable          midfield
## 4 Premier League   2011   False Not Applicable           defence
## 5 Premier League   2011   False Not Applicable          midfield
## 6 Premier League   2011   False Not Applicable            attack
unique_positions <- unique(transfer_data$position_category)
print(unique_positions)
## [1] "midfield" "attack"   "defence"  "GK"
filtered_data <- transfer_data[transfer_data$fee > 0, ]
transfer_data_2021 <- transfer_data[transfer_data$season == "2021" & transfer_data$fee > 0, ]

ggplot(transfer_data_2021, aes(x = market_value, y = fee, color = league)) +
  geom_point() +
  labs(title = "Relationship between Market Value and Transfer Fee (All Leagues - 2021)",
       x = "Market Value",
       y = "Transfer Fee",
       color = "League")

ggplot(transfer_data, aes(x = age, fill = ..count..)) +
  geom_histogram(binwidth = 1, color = "black", alpha = 0.7) +
  scale_fill_gradient("Count", low = "yellow", high = "purple") +
  labs(title = "Transfer Distribution by Age from 2011 to 2020",
       x = "Age",
       y = "Count") +
  theme_minimal()
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggplot(transfer_data, aes(x = age, y = fee, color = league)) +
  geom_point(alpha = 0.7) +
  #scale_color_manual(values = league_colors, name = "League") +
  labs(title = "Transfer Fees versus Age",
       x = "Age",
       y = "Transfer Fee",
       color = "League") +
  theme_minimal()

# x_limits =c(0,12500000)
# y_limits =c(0,15000000)
## faceted by league

ggplot(filtered_data, aes(x = market_value, y = fee)) +
  geom_point() +
  labs(title = "Relationship between Market Value and Transfer Fee (All Leagues)",
       x = "Market Value",
       y = "Transfer Fee") + facet_wrap(~ league)

ggplot(filtered_data, aes(x = log(market_value), y = log(fee))) +
  geom_point() +
  labs(title = "Relationship between log(Market Value) and log(Transfer Fee) (All Leagues)",
       x = "Market Value",
       y = "Transfer Fee") +
  facet_wrap(~ league, scales = "free")  

ggplot(filtered_data, aes(x = market_value, y = fee, color = position_category)) +
  geom_point(aes(group = position_category), size = 1) +
  #geom_point(size = 1) +
  labs(
    title = "Scatter Plot - Market Value and Position on Transfer Fees",
    x = "Market Value",
    y = "Transfer Fee",
    color = "Position"
  ) + 
  facet_wrap(~position_category)

##Log Scale
ggplot(filtered_data, aes(x = log(market_value), y = log(fee), color = position_category)) +
  geom_point(aes(group = position_category), size = 1) +
  #geom_point(size = 1) +
  labs(
    title = "Scatter Plot - log(Market Value) and Position on log(Transfer Fees)",
    x = "Market Value",
    y = "Transfer Fee",
    color = "Position"
  ) + 
  facet_wrap(~position_category)

# Filter the data for the year 2021 and exclude players with a fee of 0
transfer_data_2021 <- transfer_data[transfer_data$season == "2021" & transfer_data$fee > 0, ]

# Scatter plot with color-coded points for each league in the year 2021
ggplot(transfer_data, aes(x = market_value, y = fee, color = league)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Add a linear regression line
  labs(title = "Relationship between Market Value and Transfer Fee (All Leagues)",
       x = "Market Value",
       y = "Transfer Fee",
       color = "League") + 
  facet_grid(league ~ .)
## `geom_smooth()` using formula = 'y ~ x'

# Create a new variable for year categories
filtered_data$year_category <- cut(filtered_data$season, breaks = c(2011, 2014, 2018, 2021), labels = c("2011-2014", "2015-2018", "2019-2021"))

# Filter out rows with NA in year_category
filtered_data <- filtered_data[!is.na(filtered_data$year_category), ]

# Visualize the scatter plot with categorized years
ggplot(filtered_data, aes(x = market_value, y = fee, color = league)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue", formula = y ~ x) +
  labs(title = "Market Value vs Transfer Fees - Categorized Years",
       x = "Market Value",
       y = "Transfer Fee",
       color = "League") +
  facet_grid(league ~ year_category)

# Filter the data for the year 2021 and exclude players with a fee of 0
transfer_data_2021 <- transfer_data[transfer_data$season == "2021" & transfer_data$fee > 0, ]

# Scatter plot with color-coded points for each league in the year 2021
ggplot(transfer_data_2021, aes(x = market_value, y = fee, color = league)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Add a linear regression line
  labs(title = "Relationship between Market Value and Transfer Fee (All League) 2021",
       x = "Market Value",
       y = "Transfer Fee",
       color = "League") + 
  facet_grid(league ~ .)
## `geom_smooth()` using formula = 'y ~ x'

transfer_data_2021 <- transfer_data[transfer_data$season == "2021" & transfer_data$fee > 0, ]

# Define unique colors for each league
league_colors <- c("Premier League" = "red", "Laliga" = "blue", "1 Bundesliga" = "green", "Serie A" = "purple", "Ligue 1" = "orange")

# Scatter plot with color-coded points and separate trend lines for each league
ggplot(transfer_data_2021, aes(x = market_value, y = fee, color = league)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, aes(group = league, color = league)) +  # Separate trend lines for each league with different colors
  scale_color_manual(values = league_colors, name = "League") +  # Set colors according to league
  labs(title = "Relationship between Market Value and Transfer Fee (All Leagues - 2021)",
       x = "Market Value",
       y = "Transfer Fee",
       color = "League")
## `geom_smooth()` using formula = 'y ~ x'

create_scatter_plot <- function(data, title) {
  ggplot(data, aes(x = market_value, y = fee, color = league)) +
    geom_point() +
    geom_smooth(method = "lm", se = FALSE, aes(group = league, color = league)) +
    scale_color_manual(values = league_colors, name = "League") +
    labs(title = title,
         x = "Market Value",
         y = "Transfer Fee",
         color = "League")
}

transfer_data_2021 <- transfer_data[transfer_data$season == "2021" & transfer_data$fee > 0, ]
plot_2021 <- create_scatter_plot(transfer_data_2021, "Relationship between Market Value and Transfer Fee (All Leagues - 2021)")

transfer_data_2016 <- transfer_data[transfer_data$season == "2016" & transfer_data$fee > 0, ]
plot_2016 <- create_scatter_plot(transfer_data_2016, "Relationship between Market Value and Transfer Fee (All Leagues - 2016)")

transfer_data_2011 <- transfer_data[transfer_data$season == "2011" & transfer_data$fee > 0, ]
plot_2011 <- create_scatter_plot(transfer_data_2011, "Relationship between Market Value and Transfer Fee (All Leagues - 2011)")

par(mfrow = c(1, 3))

plot_2021
## `geom_smooth()` using formula = 'y ~ x'

plot_2016
## `geom_smooth()` using formula = 'y ~ x'

plot_2011
## `geom_smooth()` using formula = 'y ~ x'

# Calculate the correlation coefficient
correlation_coefficient <- cor(filtered_data$market_value, filtered_data$fee)

# Print the correlation coefficient
cat("Correlation Coefficient:", correlation_coefficient, "\n")
## Correlation Coefficient: 0.8706092

Using filtered_data:

model <- lm(fee ~ market_value + league, data = filtered_data)

# Plot
ggplot(filtered_data, aes(x = market_value, y = fee, color = league)) +
  geom_point() +
  geom_abline(intercept = coef(model)[1], slope = coef(model)[2], color = "blue") +
  labs(title = "Relationship between Market Value and Transfer Fee (All League)",
       x = "Market Value",
       y = "Transfer Fee",
       color = "League") + 
  facet_grid(league ~ .)

model2 <- lm(fee ~ market_value + short_pos, data = filtered_data)

# Plot
ggplot(filtered_data, aes(x = market_value, y = fee, color = position_category)) +
  geom_point() +
  geom_abline(intercept = coef(model)[1], slope = coef(model)[2], color = "blue") +
  labs(title = "Relationship between Market Value and Transfer Fee (All League)",
       x = "Market Value",
       y = "Transfer Fee",
       color = "League") + 
  facet_grid(league ~ .)

model1 <- lm(fee ~ market_value + short_pos + league + season, data = filtered_data)

ggplot(filtered_data, aes(x = market_value, y = fee, color = league)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue", formula = y ~ x) +  # Add a linear regression line
  labs(title = "Relationship between Market Value and Transfer Fee (Model)",
       x = "Market Value",
       y = "Transfer Fee",
       color = "League") +
  facet_grid(league ~ .)

MODEL WE CHOSE

Linear Regression with Age Categories

# Create age categories
filtered_data$age_category <- cut(filtered_data$age, breaks = c(18, 21, 25, 30, 35, 40, 45, 50, 100),
                                   labels = c("18-21", "22-25", "26-30", "31-35", "36-40", "41-45", "46-50", "50+"))

# Fit a linear regression model with age categories
model_age_cat <- lm(fee ~ market_value + short_pos + league + season + as.factor(age_category), data = filtered_data)

# Visualize the relationship
ggplot(filtered_data, aes(x = market_value, y = fee, color = league)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue", formula = y ~ x) +
  labs(title = "Relationship between Market Value and Transfer Fee (Model with Age Categories)",
       x = "Market Value",
       y = "Transfer Fee",
       color = "League") + 
  facet_grid(league ~ .)

ANALYSIS OF CHOSEN MODEL

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
vif(model_age_cat)
##                             GVIF Df GVIF^(1/(2*Df))
## market_value            1.111339  1        1.054201
## short_pos               1.106947 15        1.003393
## league                  1.115621  4        1.013770
## season                  1.061856  1        1.030464
## as.factor(age_category) 1.092133  4        1.011077
# Add predictions and actual values to the dataset
filtered_data$predictions_age_cat <- predict(model_age_cat, newdata = filtered_data)
filtered_data$actual_values <- filtered_data$fee

# Visualize predictions vs. actual values
ggplot(filtered_data, aes(x = actual_values, y = predictions_age_cat, color = league)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
  labs(title = "Predictions vs. Actual Transfer Fees (Model with Age Categories)",
       x = "Actual Transfer Fee",
       y = "Predicted Transfer Fee",
       color = "League")
## Warning: Removed 326 rows containing missing values (`geom_point()`).

residuals_age_cat <- residuals(model_age_cat)

# Assuming log transformation is needed for market_value
filtered_data$log_market_value <- log(filtered_data$market_value)

# Ensure both data frames have the same number of rows
min_rows <- min(nrow(filtered_data), length(residuals_age_cat))
filtered_data <- head(filtered_data, min_rows)
residuals_data <- data.frame(
  log_market_value = filtered_data$log_market_value,
  residuals = head(residuals_age_cat, min_rows),
  league = filtered_data$league
)

# Visualize residuals
ggplot(residuals_data, aes(x = log_market_value, y = residuals, color = league)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Residuals vs. Log-transformed Market Value (Model with Age Categories)",
       x = "Log-transformed Market Value",
       y = "Residuals",
       color = "League") +
  facet_grid(league ~ .)

# Predict transfer fees
predictions_age_cat <- predict(model_age_cat, newdata = filtered_data)

# Evaluate accuracy
accuracy <- sqrt(mean((filtered_data$fee - predictions_age_cat)^2))
print(paste("Root Mean Squared Error (RMSE):", round(accuracy, 2)))
## [1] "Root Mean Squared Error (RMSE): NA"
# Check for missing values
any(is.na(filtered_data$fee))
## [1] FALSE
any(is.na(predictions_age_cat))
## [1] TRUE
# Check unique values and missing values in age_category
table(filtered_data$age_category, useNA = "always")
## 
## 18-21 22-25 26-30 31-35 36-40 41-45 46-50   50+  <NA> 
##  2156  3747  2765   612    22     0     0     0   320
# Remove rows with missing age_category
filtered_data <- na.omit(filtered_data, cols = "age_category")
# Predict transfer fees
filtered_data$predictions_age_cat <- predict(model_age_cat, newdata = filtered_data)

# Evaluate accuracy
accuracy <- sqrt(mean((filtered_data$fee - filtered_data$predictions_age_cat)^2))
print(paste("Root Mean Squared Error (RMSE):", round(accuracy, 2)))
## [1] "Root Mean Squared Error (RMSE): 5715280.17"

ANALYSIS USING PREDICTED VALUES

Positional Analysis using position_category:

# Create a summary table for average predicted transfer fees by position category
position_category_summary <- aggregate(predictions_age_cat ~ position_category, data = filtered_data, FUN = mean)

Comparative Analysis for Position Categories:

library(ggplot2)
ggplot(filtered_data, aes(x = position_category, y = predictions_age_cat, fill = position_category)) +
  geom_boxplot() +
  labs(title = "Comparison of Predicted Transfer Fees by Position Category",
       x = "Position Category",
       y = "Predicted Transfer Fee",
       fill = "Position Category")

Positional Trends Over Time by Category:

ggplot(filtered_data, aes(x = season, y = predictions_age_cat, color = position_category)) +
  geom_line() +
  labs(title = "Position Category Trends in Predicted Transfer Fees Over Time",
       x = "Season",
       y = "Predicted Transfer Fee",
       color = "Position Category")

Positional Influence in Different Leagues by Category:

ggplot(filtered_data, aes(x = position_category, y = predictions_age_cat, color = league)) +
  geom_point() +
  facet_wrap(~league) +
  labs(title = "Impact of Position Category on Predicted Transfer Fees by League",
       x = "Position Category",
       y = "Predicted Transfer Fee",
       color = "League")

Top Value Position Categories:

# Identify top position categories by average predicted transfer fees
top_position_categories <- position_category_summary[order(position_category_summary$predictions, decreasing = TRUE), ]

top_position_categories
##   position_category predictions_age_cat
## 1            attack             8602416
## 4          midfield             7051186
## 2           defence             5901163
## 3                GK             4554635

Club Analysis using Club:

# Create a summary table for average predicted transfer fees by club
club_summary <- aggregate(predictions_age_cat ~ club, data = filtered_data, FUN = mean)

Top Value Clubs: which football clubs, on average, tend to have higher predicted transfer fees:

# Identify top clubs by average predicted transfer fees
top_clubs <- club_summary[order(club_summary$predictions, decreasing = TRUE), ]
top_clubs
##                         club predictions_age_cat
## 28              Brentford FC          35321225.9
## 92         Manchester United          23203024.6
## 57              FC Barcelona          23135562.2
## 40                Chelsea FC          20839961.4
## 113              Real Madrid          20783452.9
## 101      Paris Saint-Germain          19296946.0
## 85              Leeds United          18922673.6
## 23             Bayern Munich          18232087.0
## 21           Atlético Madrid          17758314.7
## 91           Manchester City          16944361.5
## 12                Arsenal FC          16609382.5
## 88              Liverpool FC          16188908.0
## 144        Tottenham Hotspur          14796638.7
## 26         Borussia Dortmund          14701303.8
## 55                Everton FC          14365181.5
## 86            Leicester City          12751318.9
## 95          Newcastle United          12438494.6
## 14                 AS Monaco          12437859.7
## 163  Wolverhampton Wanderers          12072863.8
## 20           Athletic Bilbao          11504204.2
## 106               RB Leipzig          11350145.0
## 127           Southampton FC          10911381.5
## 153              Valencia CF          10457774.4
## 124               Sevilla FC          10272890.9
## 44            Crystal Palace          10151282.3
## 84               Juventus FC           9911073.0
## 114            Real Sociedad           9891309.7
## 161          West Ham United           9866030.7
## 7                   AC Milan           9603055.0
## 99            Olympique Lyon           9400545.9
## 18               Aston Villa           9043756.0
## 22       Bayer 04 Leverkusen           8382062.6
## 133               SSC Napoli           8351613.5
## 83               Inter Milan           8302543.1
## 158            Villarreal CF           8295922.4
## 100      Olympique Marseille           8290986.6
## 72                 Fulham FC           8153811.3
## 16                   AS Roma           8126518.4
## 104      Queens Park Rangers           8094089.2
## 159               Watford FC           7997975.1
## 125         Sheffield United           7682641.8
## 112      Real Betis Balompié           7628297.0
## 142             Swansea City           7557900.4
## 107                  RC Lens           7396897.1
## 31                Burnley FC           7384386.0
## 30    Brighton & Hove Albion           7215347.8
## 162           Wigan Athletic           7167283.6
## 89                LOSC Lille           7038928.0
## 11         Arminia Bielefeld           7022720.0
## 138               Stoke City           6844941.8
## 157            VfL Wolfsburg           6774211.4
## 27  Borussia Mönchengladbach           6632234.9
## 139           Sunderland AFC           6622662.9
## 160     West Bromwich Albion           6397311.3
## 135        Stade Brestois 29           6388202.4
## 9            AFC Bournemouth           6299866.7
## 82                 Hull City           6241744.7
## 36              Cardiff City           6206134.1
## 63         FC Internazionale           6034000.6
## 67             FC Schalke 04           5981894.6
## 109   RCD Espanyol Barcelona           5630434.0
## 51       Eintracht Frankfurt           5609588.4
## 80                Hertha BSC           5601014.8
## 3          1.FC Union Berlin           5564262.9
## 8             ACF Fiorentina           5563297.6
## 81         Huddersfield Town           5561150.7
## 97              Norwich City           5510506.1
## 93          Middlesbrough FC           5462336.4
## 54              ESTAC Troyes           5433074.8
## 61     FC Girondins Bordeaux           5292292.6
## 148            UD Las Palmas           5138195.3
## 137         Stade Rennais FC           5100938.7
## 98                  OGC Nice           4997094.3
## 94           Montpellier HSC           4935582.3
## 90                 Málaga CF           4785305.9
## 39             Celta de Vigo           4691938.1
## 132                 SS Lazio           4670241.4
## 111               Reading FC           4612114.7
## 145      TSG 1899 Hoffenheim           4585713.4
## 146             UC Sampdoria           4488508.8
## 69               FC Toulouse           4462522.2
## 19               Atalanta BC           4291105.9
## 17          AS Saint-Étienne           4244453.3
## 156               VfL Bochum           4232146.2
## 77              Hamburger SV           3936806.8
## 38                CD Leganés           3903586.7
## 152              US Sassuolo           3890651.9
## 136              Stade Reims           3824004.1
## 108     RC Strasbourg Alsace           3816513.4
## 46          Deportivo Alavés           3785342.9
## 128                     SPAL           3753044.3
## 10                 Amiens SC           3720518.3
## 105           Rayo Vallecano           3589815.2
## 15         AS Nancy-Lorraine           3551603.5
## 68    FC Sochaux-Montbéliard           3471259.9
## 119              SC Freiburg           3458993.9
## 110             RCD Mallorca           3446520.2
## 66                 FC Nantes           3441685.8
## 65                   FC Metz           3418152.0
## 87                Levante UD           3416781.4
## 96           Nîmes Olympique           3379720.4
## 143                Torino FC           3376155.5
## 47    Deportivo de La Coruña           3370731.6
## 149           Udinese Calcio           3348998.2
## 115       Real Valladolid CF           3329160.5
## 134           Stade Brest 29           3323282.5
## 64                FC Lorient           3279134.4
## 73                 Genoa CFC           3268296.1
## 76                Granada CF           3254832.1
## 122                 SD Eibar           3254778.8
## 121               SCO Angers           3213089.5
## 141         SV Werder Bremen           3191723.7
## 74                 Getafe CF           3175509.9
## 34           Cagliari Calcio           3164896.4
## 4             1.FSV Mainz 05           3138964.2
## 1                 1. FC Köln           3119197.5
## 154          Valenciennes FC           3112111.0
## 25           Bologna FC 1909           3046718.0
## 48                 Dijon FCO           2863311.6
## 42          Clermont Foot 63           2730241.6
## 78               Hannover 96           2712035.5
## 126                  SM Caen           2668148.3
## 151               US Palermo           2604267.2
## 2             1.FC Nuremberg           2554642.6
## 155            VfB Stuttgart           2533119.3
## 35            Calcio Catania           2485265.9
## 60  FC Évian Thonon Gaillard           2431861.3
## 118                SC Bastia           2352141.2
## 75                 Girona FC           2342926.5
## 59                 FC Empoli           2319020.6
## 56               FC Augsburg           2259524.0
## 49               EA Guingamp           2138078.3
## 52                  Elche CF           2092230.1
## 53              ES Troyes AC           2039228.4
## 70        Fortuna Düsseldorf           2011225.2
## 32                CA Osasuna           1924490.5
## 79             Hellas Verona           1904339.4
## 45      Delfino Pescara 1936           1772364.9
## 33                  Cádiz CF           1747762.6
## 123                SD Huesca           1739582.1
## 147               UD Almería           1699063.5
## 117              Robur Siena           1698947.9
## 41             Chievo Verona           1687378.8
## 103                 Parma FC           1668484.7
## 116            Real Zaragoza           1556344.1
## 130           Sporting Gijón           1418699.2
## 58                FC Crotone           1383945.5
## 62          FC Ingolstadt 04           1326222.0
## 24          Benevento Calcio           1257524.7
## 29            Brescia Calcio           1142497.2
## 102        Parma Calcio 1913           1139757.7
## 120          SC Paderborn 07           1009378.1
## 6                  AC Cesena            956585.8
## 50    Eintracht Braunschweig            861778.9
## 37             Carpi FC 1909            859373.7
## 71          Frosinone Calcio            741295.0
## 129                SPAL 2013            737403.8
## 13                AS Livorno            691245.1
## 140          SV Darmstadt 98            635479.7
## 131     SpVgg Greuther Fürth            634340.9
## 43                Córdoba CF            551969.2
## 5                 AC Ajaccio            463712.7
## 150                 US Lecce            422402.0

Nationality Analysis using Nationality:

# Create a summary table for average predicted transfer fees by nationality
nationality_summary <- aggregate(predictions_age_cat ~ nationality, data = filtered_data, FUN = mean)
library(ggplot2)

# Filter the top 10 nationalities
top_nationalities <- nationality_summary %>%
  top_n(10, wt = predictions_age_cat)

# Bar chart for the top 10 nationalities
ggplot(top_nationalities, aes(x = reorder(nationality, predictions_age_cat), y = predictions_age_cat)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Top 10 Nationalities - Predicted Transfer Fees",
       x = "Nationality",
       y = "Predicted Transfer Fee")

# World map 

# Create a world map
world_map <- map_data("world")

library(maps)
ggplot(filtered_data, aes(map_id = nationality)) +
  geom_map(aes(fill = predictions_age_cat), map = world_map) +
  expand_limits(x = world_map$long, y = world_map$lat) +
  theme_void() +
  labs(title = "Predicted Transfer Fees by Nationality on World Map",
       fill = "Predicted Transfer Fee")

# Identify the top nationalities by average predicted transfer fees
top_nationalities <- head(nationality_summary[order(-nationality_summary$predictions), ], 10)

# Print the top nationalities
print(top_nationalities)
##                  nationality predictions_age_cat
## 5                    Armenia            25573507
## 19                   Burundi            17992459
## 11                   Belgium            15044088
## 68                     Kenya            14196397
## 100                 Portugal            13603025
## 40                     Egypt            13354175
## 37        Dominican Republic            12847806
## 23  Central African Republic            11617273
## 47                     Gabon            11512605
## 27                  Colombia            10986528

Quantifying Difference Between Leagues Over Time

# Linear regression model with league, season, and league:season interaction
model3 <- lm(fee ~ market_value + short_pos + league * season, data = filtered_data)

# Visualize the league differences over time
ggplot(filtered_data, aes(x = season, y = fee, color = league)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue", formula = y ~ x) +
  labs(title = "Difference in Transfer Fees Between Leagues Over Time",
       x = "Season",
       y = "Transfer Fee",
       color = "League") +
  facet_grid(league ~ .)

Adding Interaction Terms

# Model with interaction terms
model_interaction <- lm(fee ~ market_value * short_pos * league * season * as.factor(age_category), data = filtered_data)

# Visualization
ggplot(filtered_data, aes(x = market_value, y = fee, color = league)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue", formula = y ~ x) +  
  labs(title = "Model with Interaction Terms",
       x = "Market Value",
       y = "Transfer Fee",
       color = "League") +
  facet_grid(league ~ .)

Interaction Model - league:season interaction and log-transformed variables

# Log-transform fee and market value
filtered_data$log_fee <- log(filtered_data$fee)
filtered_data$log_market_value <- log(filtered_data$market_value)

# Model with league:season interaction and log-transformed variables
interaction_model <- lm(log_fee ~ log_market_value + short_pos + league * season, data = filtered_data)

# Visualization
ggplot(filtered_data, aes(x = log_market_value, y = log_fee, color = league)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue", formula = y ~ x) +
  labs(title = "Relationship between Log-transformed Market Value and Log-transformed Transfer Fee",
       x = "Log-transformed Market Value",
       y = "Log-transformed Transfer Fee",
       color = "League") +
  facet_grid(league ~ .)

# Check model summary
summary(interaction_model)
## 
## Call:
## lm(formula = log_fee ~ log_market_value + short_pos + league * 
##     season, data = filtered_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.7435 -0.4929  0.0859  0.6103  3.9235 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -1.683e+02  1.927e+01  -8.734  < 2e-16 ***
## log_market_value             6.265e-01  7.378e-03  84.913  < 2e-16 ***
## short_posattack              1.002e+00  3.754e-01   2.670 0.007590 ** 
## short_posCB                 -6.446e-02  4.912e-02  -1.312 0.189443    
## short_posCF                  5.982e-02  4.871e-02   1.228 0.219398    
## short_posCM                 -3.412e-02  5.124e-02  -0.666 0.505504    
## short_posdefence             1.665e+00  5.705e-01   2.919 0.003525 ** 
## short_posDM                 -1.457e-01  5.513e-02  -2.643 0.008227 ** 
## short_posGK                 -4.432e-02  5.894e-02  -0.752 0.452138    
## short_posLB                 -6.524e-02  5.817e-02  -1.122 0.262082    
## short_posLM                 -5.942e-02  1.009e-01  -0.589 0.556064    
## short_posLW                 -4.285e-02  5.783e-02  -0.741 0.458729    
## short_posmidfield            8.839e-01  3.009e-01   2.938 0.003311 ** 
## short_posRB                 -1.277e-01  5.801e-02  -2.202 0.027713 *  
## short_posRM                  1.992e-01  1.183e-01   1.685 0.092109 .  
## short_posRW                  5.238e-02  5.822e-02   0.900 0.368253    
## short_posSS                  1.372e-01  8.991e-02   1.526 0.127116    
## leagueLaliga                 9.565e+01  2.799e+01   3.418 0.000634 ***
## leagueLigue 1                8.852e+01  2.679e+01   3.304 0.000957 ***
## leaguePremier League         9.584e+01  2.620e+01   3.658 0.000255 ***
## leagueSerie A                1.675e+02  2.419e+01   6.925 4.66e-12 ***
## season                       8.615e-02  9.559e-03   9.012  < 2e-16 ***
## leagueLaliga:season         -4.733e-02  1.388e-02  -3.410 0.000653 ***
## leagueLigue 1:season        -4.381e-02  1.329e-02  -3.297 0.000981 ***
## leaguePremier League:season -4.725e-02  1.299e-02  -3.636 0.000278 ***
## leagueSerie A:season        -8.291e-02  1.200e-02  -6.911 5.14e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9838 on 9276 degrees of freedom
## Multiple R-squared:  0.5122, Adjusted R-squared:  0.5108 
## F-statistic: 389.5 on 25 and 9276 DF,  p-value: < 2.2e-16
# Check variance inflation factor (VIF) for multicollinearity
library(car)
vif(interaction_model)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
##                          GVIF Df GVIF^(1/(2*Df))
## log_market_value 1.217175e+00  1        1.103257
## short_pos        1.089866e+00 15        1.002873
## league           1.274360e+23  4      772.968011
## season           6.133796e+00  1        2.476650
## league:season    1.274144e+23  4      772.951660
# Add predictions and actual values to the dataset
filtered_data$predictions <- exp(predict(interaction_model))
filtered_data$actual_values <- filtered_data$fee

# Visualize predictions vs. actual values
ggplot(filtered_data, aes(x = actual_values, y = predictions, color = league)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
  labs(title = "Predictions vs. Actual Transfer Fees",
       x = "Actual Transfer Fee",
       y = "Predicted Transfer Fee",
       color = "League")

# Check residuals
residuals <- residuals(interaction_model)

# Visualize residuals
ggplot(filtered_data, aes(x = log_market_value, y = residuals, color = league)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Residuals vs. Log-transformed Market Value",
       x = "Log-transformed Market Value",
       y = "Residuals",
       color = "League") +
  facet_grid(league ~ .)

# Evaluate model performance
library(Metrics)
rmse_value <- rmse(filtered_data$actual_values, filtered_data$predictions)
cat("Root Mean Squared Error (RMSE):", rmse_value, "\n")
## Root Mean Squared Error (RMSE): 8735957

AICs

model <- lm(fee ~ market_value + league, data = filtered_data)
model1 <- lm(fee ~ market_value + short_pos + league + season, data = filtered_data)
model2 <- lm(fee ~ market_value + short_pos, data = filtered_data)
model3 <- lm(fee ~ market_value + short_pos + league * season, data = filtered_data)
model_age_cat <- lm(fee ~ market_value + short_pos + league + season + as.factor(age_category), data = filtered_data)
model_interaction <- lm(fee ~ market_value * short_pos * league * season * as.factor(age_category), data = filtered_data)
interaction_model <- lm(log_fee ~ log_market_value + short_pos + league * season, data = filtered_data)




# AIC values
AIC(model, model1, model2, model3, model_age_cat, model_interaction, interaction_model)
##                    df       AIC
## model               7 316192.85
## model1             23 316198.45
## model2             18 316334.64
## model3             27 316193.24
## model_age_cat      27 315901.87
## model_interaction 949 312993.23
## interaction_model  27  26121.24
correlation_coefficient <- cor(filtered_data$market_value, filtered_data$season)

# Print the correlation coefficient
cat("Correlation Coefficient:", correlation_coefficient, "\n")
## Correlation Coefficient: 0.1888159
#model 1 is the lowest
# if i were to suspect interaction it would be between position and season
model1 <- lm(fee ~ market_value + position_category + league + season, data = filtered_data)
lm_all <- lm(fee ~ market_value + position_category + league + season + age, data = filtered_data)
lm_age_val <- lm(fee ~ market_value*age + position_category + league + season, data = filtered_data)
lm_age_val2 <- lm(fee ~ market_value*age + position_category + league*season, data = filtered_data)

AIC(model1, lm_all, lm_age_val,lm_age_val2)
##             df      AIC
## model1      11 316183.5
## lm_all      12 315878.7
## lm_age_val  13 315631.2
## lm_age_val2 17 315623.2
ggplot(filtered_data, aes(x = market_value, y = fee, color = position_category)) +
  geom_point() +
  geom_abline(intercept = coef(lm_all)[1], slope = coef(lm_all)[2], color = "purple") +
  labs(title = "Relationship between Market Value and Transfer Fee (All League)",
       x = "Market Value",
       y = "Transfer Fee",
       color = "League") + 
  facet_grid(league ~ .)

lm_age_val <- lm(fee ~ market_value*age + position_category + league + season, data = filtered_data)


ggplot(filtered_data, aes(x = market_value, y = fee, color = position_category)) +
  geom_point() +
  geom_abline(intercept = coef(lm_age_val)[1], slope = coef(lm_age_val)[2], color = "purple") +
  labs(title = "Relationship between Market Value and Transfer Fee (All League)",
       x = "Market Value",
       y = "Transfer Fee",
       color = "League") + 
  facet_grid(league ~ .)

# Create a GAM model with a nonlinear term in age and market_value
gam_model <- gam(fee ~ s(market_value) + s(age) + position_category + league + s(season),
                 data = filtered_data)


# Plot the GAM model
plot(gam_model, select = 2)  # Select the age smooth term for plotting

# Summary of the GAM model
summary(gam_model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## fee ~ s(market_value) + s(age) + position_category + league + 
##     s(season)
## 
## Parametric coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                6069370     161602  37.558  < 2e-16 ***
## position_categorydefence    170217     143109   1.189  0.23430    
## position_categoryGK         183240     249089   0.736  0.46197    
## position_categorymidfield  -455786     144296  -3.159  0.00159 ** 
## leagueLaliga               1156746     204788   5.649 1.67e-08 ***
## leagueLigue 1               994962     196506   5.063 4.20e-07 ***
## leaguePremier League       2565210     193447  13.261  < 2e-16 ***
## leagueSerie A               747983     176323   4.242 2.24e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                   edf Ref.df       F p-value    
## s(market_value) 8.959  8.999 3438.57  <2e-16 ***
## s(age)          4.712  5.742   66.02  <2e-16 ***
## s(season)       8.041  8.776   33.91  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.791   Deviance explained = 79.2%
## GCV = 3.006e+13  Scale est. = 2.9964e+13  n = 9302
filtered_data$predicted_fee <- predict(gam_model, newdata = filtered_data)

# Plot the observed data and the fitted values
ggplot(filtered_data, aes(x = market_value, y = fee, color = league)) +
  geom_point(alpha = 1) +  # Scatter plot of observed data
  geom_line(aes(y = predicted_fee), color = "purple", size = 1) +  # Fitted values from the GAM model
  labs(title = "Observed Data vs. Fitted Values (GAM Model)",
       x = "Market Value",
       y = "Transfer Fee")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.